Data Analysis
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import plotly.graph_objects as gp
import plotly.express as px
# Load the cleaned 2017-2020 transaction data used throughout the analysis.
df = pd.read_csv("cleaned_dataset_2017_2020.csv")
# to find the total customers in each loyalty group (pie chart)
df['loyalty'].value_counts().plot.pie();
plt.savefig("Loyalty types and total customers")
# Unique customers per loyalty group. The string 'nunique' is the idiomatic
# agg spec; passing pd.Series.nunique as a callable is slower and non-idiomatic.
df.groupby(['loyalty']).agg(total_customers = ('customer_id', 'nunique'))
| total_customers | |
|---|---|
| loyalty | |
| First Time Buyer | 48 |
| Loyalist | 1339 |
| Promiscuous | 1872 |
# to find the total customers in each (loyalty, age_band) combination
df.groupby(['loyalty','age_band']).agg(total_customers = ('customer_id', 'nunique')).plot(figsize = (12,5), title = 'Loyalty by age_band', kind = 'barh')
plt.ylabel('Loyalty & age_band')
plt.savefig('Loyalty by age_band')
# to find total_revenue each year. 'sum' (string) avoids the pandas
# FutureWarning raised when the builtin sum callable is passed to agg.
df.transaction_date = pd.to_datetime(df.transaction_date)
df['year'] = df.transaction_date.dt.year
df['month'] = df.transaction_date.dt.month
df.groupby(['year','month']).agg(total_revenue = ('price', 'sum')).plot(figsize = (12,5), title = 'Revenue by year');
plt.savefig('Revenue by year')
df.groupby(['month']).agg(total_revenue=('price', 'sum')).plot(title="Minimum and Maximum revenue by month", figsize = (12,5), legend = False);
plt.savefig("Minimum and Maximum revenue by month")
#to find the Number of Products sold by each department
# 'nunique' counts distinct commodity names per department.
df.groupby(['department']).agg(total_products=('commodity', 'nunique')).sort_values('total_products', ascending = True).plot(figsize=(10,5),kind = 'barh', title = 'Number of Products sold by each department', legend = False)
plt.savefig('Number of Products sold by each department')
# Top 10 commodities by revenue ('sum' string avoids the pandas deprecation
# warning for builtin-callable aggregation).
top_10 = df.groupby(['commodity']).agg(total_revenue=('price', 'sum')) \
.sort_values('total_revenue', ascending = False).head(10)
gp.Figure(
    data = gp.Bar(x=top_10.index, y=top_10['total_revenue']),
    layout = gp.Layout(
        title ='Top 10 commodities by Revenue',height = 400,width =500,
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer = 'iframe')
# Bottom 10 commodities by revenue (red bars).
bottom_10 = df.groupby(['commodity']).agg(total_revenue=('price', 'sum')) \
.sort_values('total_revenue', ascending = True).head(10)
gp.Figure(
    data = gp.Bar(x=bottom_10.index, y=bottom_10['total_revenue'], marker = dict(color = "red")),
    layout = gp.Layout(
        title ='Bottom 10 commodities by Revenue',height = 400,width =500,
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer = 'iframe')
# Top 5 departments by total revenue.
top_5d= df.groupby(['department']).agg(total_revenue=('price', 'sum')) \
.sort_values('total_revenue', ascending = False).head(5)
gp.Figure(
    data = gp.Bar(x=top_5d.index, y=top_5d['total_revenue']),
    layout = gp.Layout(
        title ='Top 5 Departments by Revenue',height = 400,width =500,
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer = 'iframe')
# Bottom 5 departments by total revenue (red bars).
bottom_5d = df.groupby(['department']).agg(total_revenue=('price', 'sum')) \
.sort_values('total_revenue', ascending = True).head(5)
gp.Figure(
    data = gp.Bar(x=bottom_5d.index, y=bottom_5d['total_revenue'], marker = dict(color = "red")),
    layout = gp.Layout(
        title ='Bottom 5 Departments by Revenue',height = 400,width =500,
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer = 'iframe')
# Distinct products per brand type (e.g. national vs private).
df.groupby(['brand']).agg(total_products = ('product_id', 'nunique')).plot(figsize = (5,5), title = 'Products by brand', kind = 'barh');
Analysis by Loyalty
#'Total number of Products bought by Loyalty'
df.groupby(['loyalty']).agg(total_commodity_brand=('product_id', 'nunique')).plot(figsize=(10,5),kind = 'barh', title = 'Total number of Products bought by Loyalty', legend = False);
plt.savefig('Total number of Products bought by Loyalty')
# Basket-level revenue statistics per loyalty group ('sum' string avoids the
# deprecated builtin-sum callable in agg).
tmp_stats = df.groupby(['loyalty','basket_id']).agg(revenue = ('price', 'sum')).reset_index()
tmp_stats.groupby('loyalty').agg(avg_basket_amount = ('revenue','mean'),median_basket_amount = ('revenue','median'),total_basket_amount = ('revenue','sum'),number_of_baskets = ('basket_id','count'))
| avg_basket_amount | median_basket_amount | total_basket_amount | number_of_baskets | |
|---|---|---|---|---|
| loyalty | ||||
| First Time Buyer | 66.560385 | 68.855 | 3461.14 | 52 |
| Loyalist | 70.840921 | 68.870 | 99956.54 | 1411 |
| Promiscuous | 70.224487 | 68.085 | 142274.81 | 2026 |
# Daily revenue per loyalty group, interactive plotly view.
df.groupby(['loyalty','transaction_date']).agg(total_revenue = ('price', 'sum')).reset_index()
data = []
for l in df.loyalty.unique():
    tmp = df[df.loyalty==l].groupby(['transaction_date']).agg(revenue=('price', 'sum')).reset_index()
    data.append(gp.Scatter(x=tmp.transaction_date, y=tmp.revenue, name = l, line=dict(dash='dash')))
gp.Figure(
    data=data,
    layout = gp.Layout(
        title ='Revenue by year based on loyalty',width = 700,
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer = 'iframe')
# Static seaborn version of the same chart.
tmp = df.groupby(['loyalty','transaction_date']).agg(total_revenue = ('price', 'sum')).reset_index()
plt.figure(figsize=(20,8))
sns.lineplot(data=tmp, x='transaction_date', y='total_revenue', hue='loyalty');
plt.xlabel('Date')
plt.ylabel('total_revenue')
plt.title('Revenue by year based on loyalty')
# Limit the x axis to 5 evenly spaced tick labels (periods= keyword for clarity).
x_labels = pd.date_range(tmp.transaction_date.min(),tmp.transaction_date.max(),periods=5)
plt.xticks(x_labels);
plt.savefig('Revenue by year based on loyalty')
# Top 5 commodities by revenue within each loyalty group.
tmp_1 = df.groupby(['loyalty', 'commodity']).agg(total_revenue=('price', 'sum')).reset_index()
top_5_1 = pd.concat(
    [tmp_1[tmp_1.loyalty==l]
     .sort_values('total_revenue', ascending=False)
     .head(5) for l in tmp_1.loyalty.unique()]).reset_index(drop=True)
tmp_1.head(5)
top_5_1.head(5)
data = []
for d in top_5_1.commodity.unique():
    tmp1 = top_5_1[top_5_1.commodity==d].groupby(['loyalty']).agg(revenue=('total_revenue', 'sum')).reset_index()
    data.append(gp.Bar(x=tmp1.loyalty, y=tmp1.revenue, name = d))
gp.Figure(
    data = data,
    layout = gp.Layout(
        title ='Top commodities per Loyalty Group',
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer = 'iframe')
# Unique customers per loyalty group for the top-5 commodities of the
# top-5 departments by revenue.
topfive = df.groupby(['department',]).agg(total_revenue = ('price', 'sum')).sort_values('total_revenue', ascending = False).reset_index().head(5)
tmp = df[df.department.isin(topfive.department.unique())].groupby(['department','commodity']).agg(total_revenue = ('price', 'sum')).reset_index()
top_5c = pd.concat([tmp[tmp.department == dd].sort_values('total_revenue',ascending = False).head(5) for dd in tmp.department.unique()]).reset_index(drop = True)
top_loyalty = df.merge(top_5c[['department','commodity']], on = ['department','commodity'], how = 'inner').groupby(['loyalty','department','commodity',]).agg(unique_customers = ('customer_id', 'nunique'))
top_loyalty
| unique_customers | |||
|---|---|---|---|
| loyalty | department | commodity | |
| First Time Buyer | Deli | Cheese | 9 |
| Chicken/poultry | 3 | ||
| Deli meats | 14 | ||
| Prepared food | 6 | ||
| Salad | 4 | ||
| ... | ... | ... | ... |
| Promiscuous | Produce | Apples | 463 |
| Berries | 369 | ||
| Potatoes | 472 | ||
| Salad | 550 | ||
| Tropical fruit | 893 |
75 rows × 1 columns
# Recompute the per-loyalty top-5 commodities, then view them with seaborn
# and as an interactive plotly scatter (one colour per commodity).
tmp_1 = df.groupby(['loyalty', 'commodity']).agg(total_revenue=('price', 'sum')).reset_index()
top_5_1 = pd.concat(
    [tmp_1[tmp_1.loyalty==l]
     .sort_values('total_revenue', ascending=False)
     .head(5) for l in tmp_1.loyalty.unique()]).reset_index(drop=True);
plt.figure(figsize=(20,8))
sns.lineplot(data=top_5_1, x='loyalty', y='total_revenue', hue='commodity');
plt.xlabel('loyalty')
plt.ylabel('total_revenue');
plt.title('Top commodities by loyalty')
plt.savefig('Top commodities by loyalty')
fig = gp.Figure()
colors = px.colors.qualitative.Plotly
for i, cat in enumerate(top_5_1['commodity'].unique()):
    dfp = top_5_1[top_5_1['commodity']==cat]
    fig.add_trace(gp.Scatter(x=dfp['loyalty'], y = dfp['total_revenue'],
                             mode = 'markers',
                             marker_color = colors[i],
                             name = cat))
fig.show()
Analysis by Age band
#'Total number of Products bought by age_band'
df.groupby(['age_band']).agg(total_commodity_brand=('product_id', 'nunique')).plot(figsize=(10,5),kind = 'barh', title = 'Total number of Products bought by age_band', legend = False);
plt.savefig('Total number of Products bought by age_band')
# Basket-level revenue statistics per age band.
tmp_stats = df.groupby(['age_band','basket_id']).agg(revenue = ('price', 'sum')).reset_index()
tmp_stats.groupby('age_band').agg(avg_basket_amount = ('revenue','mean'),median_basket_amount = ('revenue','median'),total_basket_amount = ('revenue','sum'),number_of_baskets = ('basket_id','count'))
| avg_basket_amount | median_basket_amount | total_basket_amount | number_of_baskets | |
|---|---|---|---|---|
| age_band | ||||
| 19-24 | 72.049567 | 69.960 | 126591.09 | 1757 |
| 25-34 | 68.540389 | 67.585 | 40575.91 | 592 |
| 35-44 | 68.865253 | 63.910 | 14943.76 | 217 |
| 45-54 | 68.990352 | 67.740 | 62712.23 | 909 |
| 55-64 | 56.148750 | 56.420 | 449.19 | 8 |
| 65 and above | 52.538750 | 48.890 | 420.31 | 8 |
#to find Revenue by year based on age_band
df.groupby(['age_band','transaction_date']).agg(total_revenue = ('price', 'sum')).reset_index()
data = []
for a in df.age_band.unique():
    tmp = df[df.age_band==a].groupby(['transaction_date']).agg(revenue=('price', 'sum')).reset_index()
    data.append(gp.Scatter(x=tmp.transaction_date, y=tmp.revenue, name = a, line=dict(dash='dot')))
gp.Figure(
    data=data,
    layout = gp.Layout(
        title ='Revenue by year based on age_band',
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer = 'iframe')
# Top 5 departments by revenue within each age band.
tmp_1 = df.groupby(['age_band','department']).agg(total_revenue=('price', 'sum')).reset_index()
top_5_1 = pd.concat(
    [tmp_1[tmp_1.age_band==a]
     .sort_values('total_revenue', ascending=False)
     .head(5) for a in tmp_1.age_band.unique()]).reset_index(drop=True)
data = []
for d in top_5_1.department.unique():
    tmp1 = top_5_1[top_5_1.department==d].groupby(['age_band']).agg(revenue=('total_revenue', 'sum')).reset_index()
    data.append(gp.Bar(x=tmp1.age_band, y=tmp1.revenue, name = d))
gp.Figure(
    data = data,
    layout = gp.Layout(
        title ='Top Departments per Age Band',
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer = 'iframe')
# Top 5 commodities by revenue within each age band, stacked horizontally.
tmp_1 = df.groupby(['age_band','commodity']).agg(total_revenue=('price', 'sum')).reset_index()
top_5_1 = pd.concat(
    [tmp_1[tmp_1.age_band==a]
     .sort_values('total_revenue', ascending=False)
     .head(5) for a in tmp_1.age_band.unique()]).reset_index(drop=True)
data = []
for c in top_5_1.commodity.unique():
    tmp1 = top_5_1[top_5_1.commodity==c].groupby(['age_band']).agg(revenue=('total_revenue', 'sum')).reset_index()
    data.append(gp.Bar(x=tmp1.revenue, y=tmp1.age_band, name = c,orientation='h'))
fig = gp.Figure(
    data = data,
    layout = gp.Layout(
        title ='Top Commodities per Age Band',
        yaxis=dict(
            title='Revenue'
        )
    )
)
fig.update_layout(barmode='stack')
# Render explicitly, matching the sibling cells; without this the chart only
# appears via the notebook's implicit repr of update_layout's return value.
fig.show(renderer = 'iframe')
Analysis by Household Type
#'Total number of Products bought by household_type'
df.groupby(['household_type']).agg(total_commodity_brand=('product_id', 'nunique')).plot(figsize=(10,5),kind = 'barh', title = 'Total number of Products bought by household_type', legend = False);
plt.savefig('Total number of Products bought by household_type')
# Basket-level revenue statistics per household type.
tmp_stats = df.groupby(['household_type','basket_id']).agg(revenue = ('price', 'sum')).reset_index()
tmp_stats.groupby('household_type').agg(avg_basket_amount = ('revenue','mean'),median_basket_amount = ('revenue','median'),total_basket_amount = ('revenue','sum'),number_of_baskets = ('basket_id','count'))
| avg_basket_amount | median_basket_amount | total_basket_amount | number_of_baskets | |
|---|---|---|---|---|
| household_type | ||||
| 1 adult with kids | 70.411635 | 67.825 | 59427.42 | 844 |
| 2 adults with kids | 68.680837 | 67.050 | 71428.07 | 1040 |
| 2 adults with no kids | 71.249542 | 69.675 | 54434.65 | 764 |
| Single female | 71.992895 | 70.195 | 21885.84 | 304 |
| Single male | 71.459202 | 68.690 | 38516.51 | 539 |
#to find revenue by household_type
df.transaction_date = pd.to_datetime(df.transaction_date)
df['year'] = df.transaction_date.dt.year
df['month'] = df.transaction_date.dt.month
df.groupby(['year','household_type'])['price'].sum().unstack().plot.area(figsize = (20,5), title = "Revenue by Household Type");
# Revenue table per (household_type, year); 'sum' string avoids the
# deprecated builtin-sum callable in agg.
tmp = df.groupby(['household_type', 'year']).agg(revenue=('price', 'sum'))
tmp
| revenue | ||
|---|---|---|
| household_type | year | |
| 1 adult with kids | 2017 | 15703.79 |
| 2018 | 16882.13 | |
| 2019 | 19892.70 | |
| 2020 | 6948.80 | |
| 2 adults with kids | 2017 | 19163.94 |
| 2018 | 22743.27 | |
| 2019 | 21233.31 | |
| 2020 | 8287.55 | |
| 2 adults with no kids | 2017 | 17618.35 |
| 2018 | 15002.01 | |
| 2019 | 15806.97 | |
| 2020 | 6007.32 | |
| Single female | 2017 | 7294.39 |
| 2018 | 5871.17 | |
| 2019 | 6011.50 | |
| 2020 | 2708.78 | |
| Single male | 2017 | 11254.15 |
| 2018 | 12121.91 | |
| 2019 | 9693.70 | |
| 2020 | 5446.75 |
# Top 5 commodities by revenue within each household type.
tmp_1 = df.groupby(['household_type','commodity']).agg(total_revenue=('price', 'sum')).reset_index()
top_5_1 = pd.concat(
    [tmp_1[tmp_1.household_type==h]
     .sort_values('total_revenue', ascending=False)
     .head(5) for h in tmp_1.household_type.unique()]).reset_index(drop=True)
data = []
for c in top_5_1.commodity.unique():
    tmp1 = top_5_1[top_5_1.commodity==c].groupby(['household_type']).agg(revenue=('total_revenue', 'sum')).reset_index()
    data.append(gp.Bar(x=tmp1.household_type, y=tmp1.revenue, name = c))
gp.Figure(
    data = data,
    layout = gp.Layout(
        title ='Top Commodities per household_type',
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer = 'iframe')
# Bottom 5 commodities by revenue within each household type.
tmp_1 = df.groupby(['household_type','commodity']).agg(total_revenue=('price', 'sum')).reset_index()
top_5_1 = pd.concat(
    [tmp_1[tmp_1.household_type==h]
     .sort_values('total_revenue', ascending=True)
     .head(5) for h in tmp_1.household_type.unique()]).reset_index(drop=True)
data = []
for c in top_5_1.commodity.unique():
    tmp1 = top_5_1[top_5_1.commodity==c].groupby(['household_type']).agg(revenue=('total_revenue', 'sum')).reset_index()
    data.append(gp.Bar(x=tmp1.household_type, y=tmp1.revenue, name = c))
gp.Figure(
    data = data,
    layout = gp.Layout(
        title ='Bottom Commodities per household_type',
        yaxis=dict(
            title='Revenue'
        )
    )
).show(renderer = 'iframe')
# Seaborn line view of the top-5 commodities per household type.
tmp_1 = df.groupby(['household_type', 'commodity']).agg(total_revenue=('price', 'sum')).reset_index()
top_5_1 = pd.concat(
    [tmp_1[tmp_1.household_type==h]
     .sort_values('total_revenue', ascending=False)
     .head(5) for h in tmp_1.household_type.unique()]).reset_index(drop=True);
plt.figure(figsize=(20,8))
sns.lineplot(data=top_5_1, x='household_type', y='total_revenue', hue='commodity');
plt.xlabel('household_type')
plt.ylabel('total_revenue');
plt.title('Top commodities by household_type')
Text(0.5, 1.0, 'Top commodities by household_type')
# Interactive scatter of top-commodity revenue, coloured by household type.
fig = px.scatter(
    top_5_1,
    x="commodity",
    y="total_revenue",
    color="household_type",
    title='Top commodities by household_type',
)
marker_style = dict(size=10, line=dict(width=2, color='DarkSlateGrey'))
fig.update_traces(marker=marker_style, selector=dict(mode='markers'))
fig.show()
Revenue Analysis of Commodity and Department
#analysing frozen foods
frozen = df[df.commodity.str.lower().str.contains('frozen')].copy()
tmp = frozen.groupby(['year','month']).agg(total_revenue = ('price', 'sum')).reset_index()
sns.lineplot(data=tmp,x='month', y='total_revenue', hue='year' );
plt.xlabel('month')
plt.ylabel('total_revenue')
plt.title("Frozen food revenue by year")
plt.savefig("Frozen food revenue by year")
# Analysing all commodities under the department Deli.
# The commodity names are lower-cased before matching, so the pattern must be
# lower-case too: the original capitalised alternatives ('Deli meats',
# 'Chicken', ...) could never match a lower-cased string.
deli = df[df.commodity.str.lower().str.contains('deli meats|chicken|poultry|prepared food|snacks|deli specialties|cheese|service beverage|salad|sushi|sandwiches|party trays|deli supplies')].copy()
tmp = deli.groupby(['year','month']).agg(total_revenue = ('price', 'sum')).reset_index()
sns.lineplot(data=tmp,x='month', y='total_revenue', hue='year' );
plt.xlabel('month')
plt.ylabel('total_revenue')
plt.title("Deli revenue by year")
plt.savefig("Deli revenue by year")
Forecasting by ARIMA method
# Forecasting section: monthly revenue modelled with ARIMA (statsmodels).
import pandas as pd
from pandas.plotting import autocorrelation_plot
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose #library for time series analysis
from statsmodels.tsa.stattools import adfuller
from statsmodels.tsa.arima.model import ARIMA
import statsmodels.api as sm
import statsmodels
from IPython.core.interactiveshell import InteractiveShell
# Display every expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = 'all'
# Reload the cleaned dataset for the forecasting section.
df = pd.read_csv("cleaned_dataset_2017_2020.csv")
df.columns
Index(['customer_id', 'product_id', 'basket_id', 'loyalty', 'household_type',
'age_band', 'department', 'brand', 'commodity', 'store',
'transaction_date', 'price'],
dtype='object')
#to change date to month beginning
df['transaction_date'] = df.transaction_date.str[:10]
df['t_date'] = pd.to_datetime(df.transaction_date)
# to_period('M').to_timestamp() maps every date to the first day of its
# month. (The previous MonthBegin(-1) offset would roll a date that already
# falls on the 1st back into the *previous* month.)
df['t_date'] = df.t_date.dt.to_period('M').dt.to_timestamp()
df.head()
| customer_id | product_id | basket_id | loyalty | household_type | age_band | department | brand | commodity | store | transaction_date | price | t_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 15803 | 1131974 | 57266 | Loyalist | 1 adult with kids | 19-24 | Grocery | private | Baked bread/buns/rolls | 374 | 2018-10-18 | 0.99 | 2018-10-01 |
| 1 | 15803 | 1051516 | 57266 | Loyalist | 1 adult with kids | 19-24 | Produce | national | Vegetables - all others | 374 | 2018-10-18 | 0.70 | 2018-10-01 |
| 2 | 15803 | 967254 | 57266 | Loyalist | 1 adult with kids | 19-24 | Pharmaceutical | national | Cold and flu | 374 | 2018-10-18 | 1.68 | 2018-10-01 |
| 3 | 15803 | 1134222 | 57266 | Loyalist | 1 adult with kids | 19-24 | Grocery | private | Paper housewares | 374 | 2018-10-18 | 2.59 | 2018-10-01 |
| 4 | 15803 | 1003421 | 57266 | Loyalist | 1 adult with kids | 19-24 | Grocery | national | Soup | 374 | 2018-10-18 | 0.60 | 2018-10-01 |
# Monthly Deli revenue series used for ARIMA modelling.
ts = df[(df.department == 'Deli')].groupby(['t_date']).agg(total_revenue=('price', 'sum')).reset_index()
ts.head()
| t_date | total_revenue | |
|---|---|---|
| 0 | 2017-01-01 | 183.19 |
| 1 | 2017-02-01 | 293.15 |
| 2 | 2017-03-01 | 354.16 |
| 3 | 2017-04-01 | 393.33 |
| 4 | 2017-05-01 | 244.46 |
# Hold out 2020; everything earlier (36 months, 2017-2019) is training data.
in_training_window = ts.t_date < '2020-01-01'
training = ts[in_training_window].set_index('t_date')
training.shape
(36, 1)
# Visual check of the training series, then decompose it into trend,
# seasonal and residual components.
training.plot();
ts_components = seasonal_decompose(training)
ts_components.plot();
# Using mean and variance of the two halves to check if the time series is
# roughly stationary (similar moments in both halves).
split = round(len(training) / 2)
x1 = training[0:split]
x2 = training[split:]
mean1 = x1.mean()
mean2 = x2.mean()
# .iloc[0] for positional access — plain [0] on a labelled Series is a
# deprecated positional fallback in modern pandas.
print("Mean 1 & 2= ", mean1.iloc[0], mean2.iloc[0])
var1 = x1.var()
var2 = x2.var()
print("Variance 1 & 2= ", var1.iloc[0], var2.iloc[0])
Mean 1 & 2= 331.7844444444444 379.4644444444445 Variance 1 & 2= 7199.863873202616 11537.822755555557
# Augmented Dickey-Fuller test for stationarity of the training series.
test_adf = adfuller(training)
print('ADF test = ', test_adf[0])
print('p-value = ', test_adf[1])
ADF test = -4.851369879408998 p-value = 4.3328290099670464e-05
# Given that the ADF value is negative and p-value < 0.05, we can reject the null hypothesis and conclude that the time series is stationary. Now we can apply a forecasting method.
# The autocorrelation plot guides the choice of the AR order below.
autocorrelation_plot(training);
# 2020 months form the hold-out window for walk-forward evaluation.
test = ts.loc[ts.t_date >= '2020-01-01'].set_index('t_date')
# Full monthly series; the first 36 months (2017-2019) seed the history.
whole = ts.set_index('t_date').squeeze().copy()
history = whole.iloc[:36].copy()
future = test['total_revenue'].copy()
future
t_date 2020-01-01 287.27 2020-02-01 271.27 2020-03-01 395.47 2020-04-01 430.60 2020-05-01 527.45 Name: total_revenue, dtype: float64
# Walk-forward validation: refit ARIMA(5,0,0) on all available history,
# forecast one month ahead, compare with the held-out value, then extend
# the history by that month.
for t in range(len(future)):
    model = ARIMA(pd.DataFrame(history).asfreq('MS'), order=(5,0,0))
    model_fit = model.fit()
    output = model_fit.get_forecast(steps=1)
    # .iloc for positional access: plain [0] on a datetime-indexed Series is
    # a deprecated positional fallback in modern pandas.
    yhat = output.predicted_mean.iloc[0].round(2)
    stderr = output.se_mean.iloc[0].round(2)
    confint = output.conf_int().to_numpy().round(2)
    month = future.index[t]
    obs = future.iloc[t].round(2)
    print(month)
    print('prediction:', yhat, ', expected:', obs, ', stderr:', stderr, ', conf. int:', confint)
    history = whole.take(range(36 + t+1))
2020-01-01 00:00:00 prediction: 401.54 , expected: 287.27 , stderr: 92.91 , conf. int: [[219.44 583.64]] 2020-02-01 00:00:00 prediction: 315.18 , expected: 271.27 , stderr: 93.76 , conf. int: [[131.42 498.94]] 2020-03-01 00:00:00 prediction: 358.81 , expected: 395.47 , stderr: 92.51 , conf. int: [[177.49 540.13]] 2020-04-01 00:00:00 prediction: 361.86 , expected: 430.6 , stderr: 91.52 , conf. int: [[182.49 541.23]] 2020-05-01 00:00:00 prediction: 373.07 , expected: 527.45 , stderr: 91.17 , conf. int: [[194.39 551.76]]
# Forecast the next 18 months with the final ARIMA(5,0,0) fit.
# (The original comment said "2 years", but steps=18 is 18 months.)
model = ARIMA(pd.DataFrame(history).asfreq('MS'), order=(5,0,0), freq='MS')
model_fit = model.fit()
output = model_fit.get_forecast(steps=18)
output.predicted_mean
2020-06-01 375.716399 2020-07-01 317.950893 2020-08-01 363.188093 2020-09-01 375.461597 2020-10-01 391.049044 2020-11-01 368.116726 2020-12-01 347.682487 2021-01-01 356.893258 2021-02-01 362.772099 2021-03-01 364.904601 2021-04-01 361.316481 2021-05-01 356.130650 2021-06-01 357.356308 2021-07-01 359.388657 2021-08-01 359.802343 2021-09-01 359.121298 2021-10-01 357.967713 2021-11-01 358.021157 Freq: MS, Name: predicted_mean, dtype: float64
# Plot the 18-month forecast and report its 95% confidence intervals.
output.predicted_mean.plot();
output.conf_int().round(2)
| lower total_revenue | upper total_revenue | |
|---|---|---|
| 2020-06-01 | 194.07 | 557.36 |
| 2020-07-01 | 131.72 | 504.18 |
| 2020-08-01 | 175.41 | 550.97 |
| 2020-09-01 | 187.68 | 563.24 |
| 2020-10-01 | 203.27 | 578.83 |
| 2020-11-01 | 177.58 | 558.66 |
| 2020-12-01 | 156.43 | 538.93 |
| 2021-01-01 | 165.52 | 548.27 |
| 2021-02-01 | 171.38 | 554.16 |
| 2021-03-01 | 173.52 | 556.29 |
| 2021-04-01 | 169.84 | 552.79 |
| 2021-05-01 | 164.60 | 547.66 |
| 2021-06-01 | 165.82 | 548.89 |
| 2021-07-01 | 167.85 | 550.92 |
| 2021-08-01 | 168.27 | 551.34 |
| 2021-09-01 | 167.58 | 550.66 |
| 2021-10-01 | 166.43 | 549.51 |
| 2021-11-01 | 166.48 | 549.56 |
# Overlay the training history, the held-out 2020 actuals and the ARIMA
# forecast for the Deli department.
plt.figure(figsize=(12, 5), dpi=75)
series_by_label = (
    (history, 'history'),
    (future, 'future'),
    (output.predicted_mean, 'forecast'),
)
for series, label in series_by_label:
    plt.plot(series, label=label)
plt.title('Forecast vs Actuals for Deli')
plt.legend(loc='upper left', fontsize=8)
plt.show()
ARIMA projection for Frozen products
# Monthly revenue series for frozen commodities (ARIMA section).
frozen = df[df.commodity.str.lower().str.contains('frozen')].copy()
ts = frozen.groupby(['t_date']).agg(total_revenue = ('price', 'sum')).reset_index()
ts.head()
| t_date | total_revenue | |
|---|---|---|
| 0 | 2017-01-01 | 542.47 |
| 1 | 2017-02-01 | 481.42 |
| 2 | 2017-03-01 | 575.11 |
| 3 | 2017-04-01 | 394.69 |
| 4 | 2017-05-01 | 529.46 |
# Train on 2017-2019 (36 monthly observations); 2020 is held out.
training = ts.loc[ts.t_date < '2020-01-01'].set_index('t_date')
training.shape
(36, 1)
# Plot the frozen-food training series. (The original cell repeated this
# call twice; once is enough.)
training.plot();
# Mean/variance of the two halves as a rough stationarity check.
split = round(len(training)/2)
x1 = training[0:split]
x2 = training[split:]
mean1 = x1.mean()
mean2 = x2.mean()
# .iloc[0] for positional access (plain [0] on a labelled Series is deprecated).
print("Mean 1 & 2 = ", round(mean1.iloc[0]),round(mean2.iloc[0]))
var1 = x1.var()
var2 = x2.var()
print("Variance 1 & 2 = ", round(var1.iloc[0]),round(var2.iloc[0]))
Mean 1 & 2 = 431 411 Variance 1 & 2 = 13260 19881
# Augmented Dickey-Fuller stationarity test on the frozen-food series.
test_adf = adfuller(training)
print('ADF test = ',test_adf[0])
print('p-value = ',test_adf[1])
ADF test = -3.414554075110886 p-value = 0.010472617702400572
# Given that the ADF value is negative and p-value < 0.05, we can reject the null hypothesis and conclude that the time series is stationary. Now we can apply a forecasting method.
autocorrelation_plot(training);
# 2020 observations form the hold-out window.
test = ts.loc[ts.t_date >= '2020-01-01'].set_index('t_date')
print(test)
total_revenue t_date 2020-01-01 358.70 2020-02-01 508.11 2020-03-01 602.20 2020-04-01 551.03 2020-05-01 523.47
# Full monthly series; the first 36 months (2017-2019) seed the rolling history.
whole = ts.set_index('t_date').squeeze().copy()
history = whole.iloc[:36].copy()
future = test['total_revenue'].copy()
future
t_date 2020-01-01 358.70 2020-02-01 508.11 2020-03-01 602.20 2020-04-01 551.03 2020-05-01 523.47 Name: total_revenue, dtype: float64
# Walk-forward validation with ARIMA(3,0,0): forecast one month, compare
# with the held-out value, then roll the history forward.
for t in range(len(future)):
    model = ARIMA(pd.DataFrame(history).asfreq('MS'), order=(3,0,0))
    model_fit = model.fit()
    output = model_fit.get_forecast(steps=1)
    # .iloc for positional access on datetime-indexed Series (plain [0] is
    # a deprecated positional fallback in modern pandas).
    yhat = output.predicted_mean.iloc[0].round(2)
    stderr = output.se_mean.iloc[0].round(2)
    confint = output.conf_int().to_numpy().round(2)
    month = future.index[t]
    obs = future.iloc[t].round(2)
    print(month)
    print('prediction:', yhat, ', expected:', obs, ', stderr:', stderr, ', conf. int:', confint)
    history = whole.take(range(36 + t+1))
2020-01-01 00:00:00 prediction: 506.62 , expected: 358.7 , stderr: 117.14 , conf. int: [[277.04 736.2 ]] 2020-02-01 00:00:00 prediction: 431.37 , expected: 508.11 , stderr: 117.51 , conf. int: [[201.06 661.69]] 2020-03-01 00:00:00 prediction: 321.16 , expected: 602.2 , stderr: 116.45 , conf. int: [[ 92.92 549.41]] 2020-04-01 00:00:00 prediction: 429.39 , expected: 551.03 , stderr: 121.65 , conf. int: [[190.97 667.81]] 2020-05-01 00:00:00 prediction: 399.2 , expected: 523.47 , stderr: 121.35 , conf. int: [[161.35 637.04]]
# Final ARIMA(3,0,0) fit on the full history; forecast 18 months ahead.
model = ARIMA(pd.DataFrame(history).asfreq('MS'), order=(3,0,0), freq='MS')
model_fit = model.fit()
output = model_fit.get_forecast(steps=18)
output.predicted_mean.plot();
# Overlay history, 2020 actuals and the ARIMA forecast for frozen foods,
# then report the 95% confidence intervals of the forecast.
plt.figure(figsize=(12, 5), dpi=75)
for series, label in ((history, 'history'),
                      (future, 'future'),
                      (output.predicted_mean, 'forecast')):
    plt.plot(series, label=label)
plt.title('Forecast vs Actuals for Frozen foods')
plt.legend(loc='upper left', fontsize=8)
plt.show()
output.conf_int().round(2)
| lower total_revenue | upper total_revenue | |
|---|---|---|
| 2020-06-01 | 164.08 | 638.98 |
| 2020-07-01 | 161.14 | 638.34 |
| 2020-08-01 | 175.35 | 654.07 |
| 2020-09-01 | 192.16 | 677.51 |
| 2020-10-01 | 193.00 | 678.49 |
| 2020-11-01 | 188.94 | 674.57 |
| 2020-12-01 | 185.31 | 671.15 |
| 2021-01-01 | 185.18 | 671.02 |
| 2021-02-01 | 186.08 | 671.94 |
| 2021-03-01 | 186.71 | 672.58 |
| 2021-04-01 | 186.71 | 672.58 |
| 2021-05-01 | 186.52 | 672.38 |
| 2021-06-01 | 186.40 | 672.27 |
| 2021-07-01 | 186.41 | 672.27 |
| 2021-08-01 | 186.45 | 672.31 |
| 2021-09-01 | 186.47 | 672.33 |
| 2021-10-01 | 186.47 | 672.33 |
| 2021-11-01 | 186.46 | 672.32 |
Forecasting using moving average method
# Overall monthly revenue across all departments.
revenue = df.groupby(['t_date']).agg(total_revenue = ('price', 'sum')).reset_index()
revenue.head()
| t_date | total_revenue | |
|---|---|---|
| 0 | 2017-01-01 | 5154.58 |
| 1 | 2017-02-01 | 5625.23 |
| 2 | 2017-03-01 | 6181.24 |
| 3 | 2017-04-01 | 5583.59 |
| 4 | 2017-05-01 | 5362.04 |
# Rows whose commodity mentions 'frozen' (case-insensitive match).
is_frozen = df.commodity.str.lower().str.contains('frozen')
frozen = df[is_frozen].copy()
frozen.head()
| customer_id | product_id | basket_id | loyalty | household_type | age_band | department | brand | commodity | store | transaction_date | price | t_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 5 | 15803 | 937791 | 57266 | Loyalist | 1 adult with kids | 19-24 | Grocery | national | Frozen pizza | 374 | 2018-10-18 | 2.25 | 2018-10-01 |
| 15 | 15803 | 987311 | 57266 | Loyalist | 1 adult with kids | 19-24 | Seafood | national | Seafood-frozen | 374 | 2018-10-18 | 2.50 | 2018-10-01 |
| 37 | 15811 | 1012594 | 57381 | Promiscuous | Single female | 19-24 | Nutrition | national | Frozen | 374 | 2019-10-18 | 4.00 | 2019-10-01 |
| 80 | 15818 | 1121213 | 62607 | Loyalist | 2 adults with no kids | 19-24 | Grocery | national | Frozen meat | 374 | 2017-12-18 | 3.58 | 2017-12-01 |
| 89 | 15820 | 1072843 | 52166 | Loyalist | 2 adults with kids | 25-34 | Grocery | national | Frozen bread/dough | 374 | 2019-08-18 | 3.98 | 2019-08-01 |
# Frozen-food monthly revenue series for the moving-average forecast.
frozen_ts = frozen.groupby(['t_date']).agg(total_revenue=('price', 'sum'))
frozen_ts.head(36)
| total_revenue | |
|---|---|
| t_date | |
| 2017-01-01 | 542.47 |
| 2017-02-01 | 481.42 |
| 2017-03-01 | 575.11 |
| 2017-04-01 | 394.69 |
| 2017-05-01 | 529.46 |
| 2017-06-01 | 377.45 |
| 2017-07-01 | 573.54 |
| 2017-08-01 | 623.00 |
| 2017-09-01 | 250.17 |
| 2017-10-01 | 325.93 |
| 2017-11-01 | 288.66 |
| 2017-12-01 | 493.14 |
| 2018-01-01 | 431.70 |
| 2018-02-01 | 399.87 |
| 2018-03-01 | 512.09 |
| 2018-04-01 | 342.27 |
| 2018-05-01 | 262.64 |
| 2018-06-01 | 347.35 |
| 2018-07-01 | 587.81 |
| 2018-08-01 | 555.65 |
| 2018-09-01 | 327.74 |
| 2018-10-01 | 281.44 |
| 2018-11-01 | 367.35 |
| 2018-12-01 | 415.56 |
| 2019-01-01 | 266.69 |
| 2019-02-01 | 293.93 |
| 2019-03-01 | 479.26 |
| 2019-04-01 | 655.28 |
| 2019-05-01 | 453.76 |
| 2019-06-01 | 355.10 |
| 2019-07-01 | 306.40 |
| 2019-08-01 | 457.89 |
| 2019-09-01 | 336.06 |
| 2019-10-01 | 334.36 |
| 2019-11-01 | 213.02 |
| 2019-12-01 | 715.14 |
frozen_ts.plot()
# 6-month simple moving average of frozen-food revenue.
moving_avg = frozen_ts.rolling(window=6).mean()
print(moving_avg.head(41))
total_revenue t_date 2017-01-01 NaN 2017-02-01 NaN 2017-03-01 NaN 2017-04-01 NaN 2017-05-01 NaN 2017-06-01 483.433333 2017-07-01 488.611667 2017-08-01 512.208333 2017-09-01 458.051667 2017-10-01 446.591667 2017-11-01 406.458333 2017-12-01 425.740000 2018-01-01 402.100000 2018-02-01 364.911667 2018-03-01 408.565000 2018-04-01 411.288333 2018-05-01 406.951667 2018-06-01 382.653333 2018-07-01 408.671667 2018-08-01 434.635000 2018-09-01 403.910000 2018-10-01 393.771667 2018-11-01 411.223333 2018-12-01 422.591667 2019-01-01 369.071667 2019-02-01 325.451667 2019-03-01 350.705000 2019-04-01 413.011667 2019-05-01 427.413333 2019-06-01 417.336667 2019-07-01 423.955000 2019-08-01 451.281667 2019-09-01 427.415000 2019-10-01 373.928333 2019-11-01 333.805000 2019-12-01 393.811667 2020-01-01 402.528333 2020-02-01 410.898333 2020-03-01 455.255000 2020-04-01 491.366667 2020-05-01 543.108333
# Draw the observed series and its moving average on the same axes.
figure, axis = plt.subplots(1, 1)
frozen_ts.plot(ax=axis)
moving_avg.plot(ax=axis, color='red', title="Forecast for Frozen foods")
plt.legend(['current', 'forecast'])
plt.show()
# Restrict to the Deli department for the moving-average forecast.
deli = df.loc[df.department == 'Deli'].copy()
deli.head()
| customer_id | product_id | basket_id | loyalty | household_type | age_band | department | brand | commodity | store | transaction_date | price | t_date | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 9 | 15803 | 845307 | 57266 | Loyalist | 1 adult with kids | 19-24 | Deli | national | Deli meats | 374 | 2018-10-18 | 3.17 | 2018-10-01 |
| 56 | 15815 | 1052729 | 52250 | Loyalist | Single female | 19-24 | Deli | national | Chicken/poultry | 374 | 2019-08-18 | 3.96 | 2019-08-01 |
| 57 | 15815 | 9524291 | 52250 | Loyalist | Single female | 19-24 | Deli | private | Prepared food | 374 | 2019-08-18 | 6.99 | 2019-08-01 |
| 82 | 15818 | 933835 | 62607 | Loyalist | 2 adults with no kids | 19-24 | Deli | national | Deli meats | 374 | 2017-12-18 | 3.11 | 2017-12-01 |
| 94 | 15820 | 1052729 | 52166 | Loyalist | 2 adults with kids | 25-34 | Deli | national | Chicken/poultry | 374 | 2019-08-18 | 3.96 | 2019-08-01 |
# Deli monthly revenue series and a simple-moving-average forecast.
deli_ts = deli.groupby(['t_date']).agg(total_revenue=('price', 'sum'))
deli_ts.plot();
train_len = 36
train = deli_ts[0:train_len]
test = deli_ts[train_len:]
y_hat_sma = deli_ts.copy()
ma_window = 6
y_hat_sma['sma_forecast'] = deli_ts['total_revenue'].rolling(ma_window).mean()
# .loc avoids the chained assignment (SettingWithCopyWarning) of the
# original y_hat_sma['sma_forecast'][train_len:] = ...; the last in-sample
# moving average is held flat over the forecast horizon.
y_hat_sma.loc[y_hat_sma.index[train_len:], 'sma_forecast'] = y_hat_sma['sma_forecast'].iloc[train_len-1]
plt.figure(figsize=(10,5));
plt.grid();
plt.plot(train['total_revenue'], label='Train');
plt.plot(test['total_revenue'], label='Test')
plt.plot(y_hat_sma['sma_forecast'], label='Simple moving average forecast');
plt.legend(loc='best');
plt.title('Moving Average Method Forecast for Deli department');
plt.show();